package cn.com.infcn.spider.util;

import java.io.File;
import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.nlpcn.commons.lang.util.IOUtil;

import com.alibaba.fastjson.JSONObject;

/**
 * File parsing utility: runs a file through Apache Tika's auto-detecting
 * parser and returns the extracted metadata and body text as one flat JSON
 * object.
 * 
 * @author ansj
 *
 */
public class FileParser {
	/**
	 * Parses the given file and collects the outcome in a {@link JSONObject}.
	 * <p>
	 * The result always contains:
	 * <ul>
	 * <li>{@code uri} - the absolute path of the file</li>
	 * <li>{@code fileName} - the last path segment ({@link File#getName()})</li>
	 * <li>{@code fileSize} - {@link File#length()} rendered as a string</li>
	 * </ul>
	 * On success it additionally contains one entry per Tika metadata name and
	 * a {@code content} entry holding the extracted body text. Metadata entries
	 * are written after the three fixed keys, so a metadata name equal to one of
	 * them replaces it; {@code content} is written last.
	 * <p>
	 * On failure no exception is propagated from the parsing step: the result
	 * carries a non-null {@code message} entry describing the error and has no
	 * {@code content} entry. Metadata entries are also absent, because they are
	 * only copied after the parse call returns.
	 *
	 * @param file the file to parse; must not be {@code null}
	 * @return a JSON object with file information plus either the parse output
	 *         or an error {@code message}
	 * @throws NullPointerException if {@code file} is {@code null}
	 */
	public static JSONObject parser(File file) {
		// Fail fast with a descriptive message instead of an anonymous NPE
		// from the first dereference below.
		if (file == null) {
			throw new NullPointerException("file must not be null");
		}

		JSONObject result = new JSONObject();

		String uri = file.getAbsolutePath();

		result.put("uri", uri);
		result.put("fileName", file.getName());
		result.put("fileSize", String.valueOf(file.length()));

		// NOTE(review): a new AutoDetectParser is built on every call; if Tika
		// guarantees thread-safety for it, a shared instance would avoid the
		// repeated setup cost - confirm before changing.
		Parser parser = new AutoDetectParser();

		// NOTE(review): IOUtil.getInputStream is presumably opening the file at
		// this path; its behaviour for missing files is not visible here.
		try (InputStream is = IOUtil.getInputStream(uri)) {
			// Integer.MAX_VALUE as write limit: effectively no cap on extracted text.
			BodyContentHandler hand = new BodyContentHandler(Integer.MAX_VALUE);
			Metadata metadata = new Metadata();
			parser.parse(is, hand, metadata, new ParseContext());

			// NOTE(review): Metadata.get(name) yields a single value per name;
			// verify whether multi-valued properties need getValues(name).
			for (String meta : metadata.names()) {
				result.put(meta, metadata.get(meta));
			}
			result.put("content", hand.toString());
		} catch (Exception e) {
			// Throwable.getMessage() may be null (e.g. a bare NullPointerException);
			// fall back to toString() so the failure is always visible to callers.
			String message = e.getMessage();
			if (message == null) {
				message = e.toString();
			}
			result.put("message", message);
			e.printStackTrace();
		}
		return result;
	}
}
